# load packages
library(readr)
library(dplyr)
library(ggplot2)
library(gridExtra)
library(corrplot)
library(plotly)
library(reshape2)
library(car)
library(kableExtra)
library(broom)
library(knitr)
library(pROC)
library(ggpattern)
library(tidyr)
library(ResourceSelection)
library(sf)
library(arm)

Setting up Data

# import dataset
female <- read_csv("/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/Data/wm.csv")
# get a glimpse of dataset
head(female)
## # A tibble: 6 × 458
##     HH1   HH2    LN   WM1   WM2   WM3 WMINT   WM4   WM5  WM6D  WM6M  WM6Y   WM8
##   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     1     2     3     1     2     3    92    91    92    19    11  2020     2
## 2     1     4     2     1     4     2    92    91    92    18    11  2020     1
## 3     1     9     4     1     9     4    92    91    92    18    11  2020     1
## 4     1    10     4     1    10     4    92    91    92    18    11  2020     2
## 5     1    11     4     1    11     4    92    91    92    19    11  2020     2
## 6     1    11     5     1    11     5    92    91    92    18    11  2020     2
## # ℹ 445 more variables: WM9 <dbl>, WM17 <dbl>, WM7H <dbl>, WM7M <dbl>,
## #   WM10H <dbl>, WM10M <dbl>, WM11 <dbl>, WM12 <dbl>, WM13 <dbl>, WM14 <dbl>,
## #   WM15 <dbl>, WM22 <dbl>, WM23 <dbl>, WM24 <dbl>, WMHINT <dbl>, WMFIN <dbl>,
## #   WB3M <dbl>, WB3Y <dbl>, WB4 <dbl>, WB5 <dbl>, WB6A <dbl>, WB6B <dbl>,
## #   WB7 <dbl>, WB9 <lgl>, WB10A <lgl>, WB10B <lgl>, WB11 <lgl>, WB12A <lgl>,
## #   WB12B <lgl>, WB14 <dbl>, WB15 <dbl>, WB16 <dbl>, WB17 <dbl>, WB18 <dbl>,
## #   WB19A <chr>, WB19B <chr>, WB19C <chr>, WB19D <chr>, WB19E <chr>, …
# Select the specified columns to create a new dataframe
female_df <- dplyr::select(female, WAGEM, MSTATUS, HH6, HH7, welevel, insurance, ethnicity, windex5, CP2, HA1, MT4, MT9, MT11)

# View the first few rows of the new dataframe
summary(female_df)
##      WAGEM          MSTATUS           HH6             HH7           welevel    
##  Min.   :10.00   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :0.00  
##  1st Qu.:18.00   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.00  
##  Median :20.00   Median :1.000   Median :2.000   Median :3.000   Median :2.00  
##  Mean   :20.92   Mean   :1.413   Mean   :1.683   Mean   :3.404   Mean   :2.46  
##  3rd Qu.:23.00   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000   3rd Qu.:3.00  
##  Max.   :47.00   Max.   :9.000   Max.   :2.000   Max.   :6.000   Max.   :9.00  
##  NA's   :2026    NA's   :11      NA's   :11      NA's   :11      NA's   :524   
##    insurance       ethnicity        windex5           CP2       
##  Min.   :1.000   Min.   :1.000   Min.   :0.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :1.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :1.135   Mean   :2.035   Mean   :2.494   Mean   :1.431  
##  3rd Qu.:1.000   3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :9.000   Max.   :6.000   Max.   :5.000   Max.   :9.000  
##  NA's   :524                                     NA's   :864    
##       HA1             MT4             MT9             MT11     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.00  
##  Median :1.000   Median :2.000   Median :1.000   Median :1.00  
##  Mean   :1.215   Mean   :1.664   Mean   :1.365   Mean   :1.12  
##  3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:1.00  
##  Max.   :9.000   Max.   :9.000   Max.   :9.000   Max.   :9.00  
##  NA's   :524     NA's   :524     NA's   :2496    NA's   :524
# Rename the columns
female_df <- female_df %>%
  rename(
  age_first_marriage = WAGEM,
  marital_status = MSTATUS,
  area = HH6,
  region = HH7,
  education_level = welevel,
  health_insurance = insurance,
  ethnicity = ethnicity,
  wealth_index = windex5,
  current_contraceptive_use = CP2,
  awareness_hiv_aids = HA1,
  used_computer_tablet = MT4,
  used_internet = MT9,
  owns_mobile_phone = MT11
)
# Summarize missing values by column
summarize_missing <- sapply(female_df, function(x) sum(is.na(x)))
print(summarize_missing)
##        age_first_marriage            marital_status                      area 
##                      2026                        11                        11 
##                    region           education_level          health_insurance 
##                        11                       524                       524 
##                 ethnicity              wealth_index current_contraceptive_use 
##                         0                         0                       864 
##        awareness_hiv_aids      used_computer_tablet             used_internet 
##                       524                       524                      2496 
##         owns_mobile_phone 
##                       524
# Recode the value 9 to NA for specified variables
female_df <- female_df %>%
  mutate(
    current_contraceptive_use = na_if(current_contraceptive_use, 9),
    used_internet = na_if(used_internet, 9),
    health_insurance = na_if(health_insurance, 9),
    education_level = na_if(education_level, 9),
    awareness_hiv_aids = na_if(awareness_hiv_aids, 9),
    used_computer_tablet = na_if(used_computer_tablet, 9),
    owns_mobile_phone = na_if(owns_mobile_phone, 9),
    marital_status = na_if(marital_status, 9)
  )

# Recode the value 6 to NA for 'ethnicity'
female_df$ethnicity <- na_if(female_df$ethnicity, 6)

# Recode the value 0 to NA for 'wealth_index'
female_df$wealth_index <- na_if(female_df$wealth_index, 0)
# Recoding variables from 1 (Yes) and 2 (No) to 1 (Yes) and 0 (No)
female_df$health_insurance <- ifelse(female_df$health_insurance == 2, 0, female_df$health_insurance)
female_df$current_contraceptive_use <- ifelse(female_df$current_contraceptive_use == 2, 0, female_df$current_contraceptive_use)
female_df$awareness_hiv_aids <- ifelse(female_df$awareness_hiv_aids == 2, 0, female_df$awareness_hiv_aids)
female_df$used_computer_tablet <- ifelse(female_df$used_computer_tablet == 2, 0, female_df$used_computer_tablet)
female_df$owns_mobile_phone <- ifelse(female_df$owns_mobile_phone == 2, 0, female_df$owns_mobile_phone)
female_df$used_internet <- ifelse(female_df$used_internet == 2, 0, female_df$used_internet)
# View the changes to ensure NA substitution has been correctly applied
summary(female_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.92      Mean   :1.408   Mean   :1.683   Mean   :3.404  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :3.000   Max.   :2.000   Max.   :6.000  
##  NA's   :2026       NA's   :18      NA's   :11      NA's   :11     
##  education_level health_insurance   ethnicity      wealth_index  
##  Min.   :0.00    Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.00    1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.00    Median :1.0000   Median :1.000   Median :2.000  
##  Mean   :2.46    Mean   :0.8659   Mean   :1.586   Mean   :2.615  
##  3rd Qu.:3.00    3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.00    Max.   :1.0000   Max.   :4.000   Max.   :5.000  
##  NA's   :525     NA's   :525      NA's   :1148    NA's   :524    
##  current_contraceptive_use awareness_hiv_aids used_computer_tablet
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000      
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:0.0000      
##  Median :1.0000            Median :1.0000     Median :0.0000      
##  Mean   :0.5842            Mean   :0.7975     Mean   :0.3412      
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000      
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000      
##  NA's   :885               NA's   :541        NA's   :532         
##  used_internet    owns_mobile_phone
##  Min.   :0.0000   Min.   :0.0000   
##  1st Qu.:0.0000   1st Qu.:1.0000   
##  Median :1.0000   Median :1.0000   
##  Mean   :0.6394   Mean   :0.8831   
##  3rd Qu.:1.0000   3rd Qu.:1.0000   
##  Max.   :1.0000   Max.   :1.0000   
##  NA's   :2501     NA's   :529

Creating New Variable Access to Media

# Combine individual variables for access to the internet, phone, and computer into a single variable
# This new variable "access_to_media" will have a value of 1 if the respondent has access to any of these media sources, and 0 if not
# This provides a more comprehensive measure of media access
female_df <- female_df %>%
  mutate(access_to_media = ifelse(used_computer_tablet == 1 | used_internet == 1 | owns_mobile_phone == 1, 1, 0))
# In later analysis, use access_to_media instead of the 3 separate variables
female_df <- female_df %>%
  dplyr::select(-used_computer_tablet, -used_internet, -owns_mobile_phone)
# Removing rows where 'age_first_marriage' is NA in female_df
# This is because these values represent people who are not yet married, which is not the focus of the study
married_df <- female_df[!is.na(female_df$age_first_marriage), ]
summary(married_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.92      Mean   :1.063   Mean   :1.708   Mean   :3.377  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :2.000   Max.   :2.000   Max.   :6.000  
##                                                                    
##  education_level health_insurance   ethnicity      wealth_index  
##  Min.   :0.00    Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.00    1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.00    Median :1.0000   Median :1.000   Median :2.000  
##  Mean   :2.29    Mean   :0.8597   Mean   :1.641   Mean   :2.541  
##  3rd Qu.:3.00    3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.00    Max.   :1.0000   Max.   :4.000   Max.   :5.000  
##  NA's   :405     NA's   :404      NA's   :980     NA's   :404    
##  current_contraceptive_use awareness_hiv_aids access_to_media 
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000  
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:1.0000  
##  Median :1.0000            Median :1.0000     Median :1.0000  
##  Mean   :0.7099            Mean   :0.7756     Mean   :0.8918  
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000  
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000  
##  NA's   :744               NA's   :415        NA's   :408
# Exporting female_df to a CSV file in the current working directory
#write.csv(married_df, "married_df.csv", row.names = FALSE)

Distributions of Data

# Histogram for 'Age at First Marriage'
afm_hist <- ggplot(married_df, aes(x = age_first_marriage)) +
  geom_histogram(fill = "#F7C0C8", color = "black") +
  theme_minimal() +
  ggtitle("Histogram of Age at First Marriage") +
  xlab("Age at First Marriage") +
  ylab("Frequency")

print(afm_hist)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Handling Missing Data

# Before that, let's double check the count of missing values by column
count_missing <- sapply(married_df, function(x) sum(is.na(x)))
print(count_missing)
##        age_first_marriage            marital_status                      area 
##                         0                         0                         0 
##                    region           education_level          health_insurance 
##                         0                       405                       404 
##                 ethnicity              wealth_index current_contraceptive_use 
##                       980                       404                       744 
##        awareness_hiv_aids           access_to_media 
##                       415                       408
# Make a copy of female_df for imputation
imputed_df <- married_df
# Define a function to calculate mode for categorical variables
getMode <- function(v) {
  # The mode is the value that appears most frequently in the data
  uniqv <- unique(na.omit(v))  # Omit NA values and get unique values
  uniqv[which.max(tabulate(match(v, uniqv)))]  # Return the value with the highest frequency
}
# For 'ethnicity', an ordinal variable with predefined categories, it makes sense to impute missing values with the mode.
mode_ethnicity <- getMode(imputed_df$ethnicity)
imputed_df <- mutate(imputed_df, ethnicity = ifelse(is.na(ethnicity), mode_ethnicity, ethnicity))
mode_area <- getMode(imputed_df$area)
imputed_df <- mutate(imputed_df, area = ifelse(is.na(area), mode_area, area))
mode_region <- getMode(imputed_df$region)
imputed_df <- mutate(imputed_df, region = ifelse(is.na(region), mode_region, region))
mode_marital_status <- getMode(imputed_df$marital_status)
imputed_df <- mutate(imputed_df, marital_status = ifelse(is.na(marital_status), mode_marital_status, marital_status))


# Binary variables like 'current_contraceptive_use', 'health_insurance','awareness_hiv_aids', 'used_internet', 'used_computer_tablet', 'owns_mobile_phone', and 'access_to_media'
# should be imputed with the mode since it represents the most frequent category (either 0 or 1).

# Calculate the mode for each binary variable
mode_current_contraceptive_use <- getMode(imputed_df$current_contraceptive_use)
mode_health_insurance <- getMode(imputed_df$health_insurance)
mode_awareness_hiv_aids <- getMode(imputed_df$awareness_hiv_aids)
mode_access_to_media <- getMode(imputed_df$access_to_media)

# Impute missing values for binary variables
imputed_df <- mutate(imputed_df,
  current_contraceptive_use = ifelse(is.na(current_contraceptive_use), mode_current_contraceptive_use, current_contraceptive_use),
  health_insurance = ifelse(is.na(health_insurance), mode_health_insurance, health_insurance),
  awareness_hiv_aids = ifelse(is.na(awareness_hiv_aids), mode_awareness_hiv_aids, awareness_hiv_aids),
  access_to_media = ifelse(is.na(access_to_media), mode_access_to_media, access_to_media)
)

# 'education_level' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Primary", "Secondary"), using the mode may still be appropriate.
mode_education_level <- getMode(imputed_df$education_level)
imputed_df <- mutate(imputed_df, education_level = ifelse(is.na(education_level), mode_education_level, education_level))

# 'wealth_index' is an ordinal variable where the median could be a more suitable measure of central tendency than the mode.
# However, given the categorical nature of the levels (e.g., "Poorest", "Poor",...), using the mode may still be appropriate.
mode_wealth_index <- getMode(imputed_df$wealth_index)
imputed_df <- mutate(imputed_df, wealth_index = ifelse(is.na(wealth_index), mode_wealth_index, wealth_index))
# Check the resulting dataset to confirm changes
summary(imputed_df)
##  age_first_marriage marital_status       area           region     
##  Min.   :10.00      Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:18.00      1st Qu.:1.000   1st Qu.:1.000   1st Qu.:2.000  
##  Median :20.00      Median :1.000   Median :2.000   Median :3.000  
##  Mean   :20.92      Mean   :1.063   Mean   :1.708   Mean   :3.377  
##  3rd Qu.:23.00      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:5.000  
##  Max.   :47.00      Max.   :2.000   Max.   :2.000   Max.   :6.000  
##  education_level health_insurance   ethnicity      wealth_index  
##  Min.   :0.000   Min.   :0.0000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :2.000   Median :1.0000   Median :1.000   Median :2.000  
##  Mean   :2.278   Mean   :0.8658   Mean   :1.574   Mean   :2.474  
##  3rd Qu.:3.000   3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:4.000  
##  Max.   :5.000   Max.   :1.0000   Max.   :4.000   Max.   :5.000  
##  current_contraceptive_use awareness_hiv_aids access_to_media 
##  Min.   :0.0000            Min.   :0.0000     Min.   :0.0000  
##  1st Qu.:0.0000            1st Qu.:1.0000     1st Qu.:1.0000  
##  Median :1.0000            Median :1.0000     Median :1.0000  
##  Mean   :0.7332            Mean   :0.7856     Mean   :0.8965  
##  3rd Qu.:1.0000            3rd Qu.:1.0000     3rd Qu.:1.0000  
##  Max.   :1.0000            Max.   :1.0000     Max.   :1.0000
# After imputation, let's check for missing values
count_imputation <- sapply(imputed_df, function(x) sum(is.na(x)))
print(count_imputation)
##        age_first_marriage            marital_status                      area 
##                         0                         0                         0 
##                    region           education_level          health_insurance 
##                         0                         0                         0 
##                 ethnicity              wealth_index current_contraceptive_use 
##                         0                         0                         0 
##        awareness_hiv_aids           access_to_media 
##                         0                         0

Visualization Comparing Before vs. After Imputation

# Histogram for 'ethnicity' before imputation
ethnic <- ggplot(married_df, aes(x = ethnicity)) + 
  geom_histogram(fill = "#F7C0C8", color = "black", bins = 30) +
  theme_light() +
  ggtitle("Before Imputation") +
  xlab("Ethnicity") +
  ylab("Frequency")

# Histogram for 'ethnicity' after imputation
ethnic_imputed <- ggplot(imputed_df, aes(x = ethnicity)) + 
  geom_histogram(fill = "#E83853", color = "black", bins = 30) +
  theme_light() +
  ggtitle("After Imputation") +
  xlab("Ethnicity") +
  ylab("Frequency")

# Arrange the two plots side by side
grid.arrange(ethnic, ethnic_imputed, ncol = 2)
## Warning: Removed 980 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Histogram for 'education_level' before imputation
educ <- ggplot(married_df, aes(x = education_level)) + 
  geom_histogram(fill = "#F7C0C8", color = "black", bins = 30) +
  theme_light() +
  ggtitle("Before Imputation") +
  xlab("Education Level") +
  ylab("Frequency")

# Histogram for 'education_level' after imputation
educ_imputed <- ggplot(imputed_df, aes(x = education_level)) + 
  geom_histogram(fill = "#E83853", color = "black", bins = 30) +
  theme_light() +
  ggtitle("After Imputation") +
  xlab("Education Level") +
  ylab("Frequency")

# Arrange the two plots side by side
grid.arrange(educ, educ_imputed, ncol = 2)
## Warning: Removed 405 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Arrange the two plots side by side and capture the layout as a grob
#combined_plots <- arrangeGrob(afm_0, afm_imputed, ncol = 2)

# Now, use ggsave to save the combined plot
#ggsave("combined_age_first_marriage.png", plot = combined_plots, width = 10, height = 5)

Creating Binary Variables for Child Marriage Under 18 and 16

# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
imputed_df <- imputed_df %>%
  mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
imputed_df <- imputed_df %>%
  mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
imputed_df <- imputed_df %>%
  dplyr::select(child_marriage, child_marriage_u16, everything())
# Exporting female_df to a CSV file in the current working directory
#write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)

EDA

Vietnam Map with Regions and Their Percentages of Child Marriage (under 18)

# Aggregate the data by region to get the total number of child marriages under 18 per region
region_counts <- aggregate(child_marriage ~ region, data = imputed_df, FUN = sum)

# Calculate the total number of child marriages under 18 in the dataset
total_child_marriages <- sum(region_counts$child_marriage)

# Calculate the percentage for each region
region_counts$married_u18_perc_of_total <- (region_counts$child_marriage / total_child_marriages) * 100
# Mapping region numbers to names
region_names <- c("Red River Delta", "Northern Midlands And Mountain", 
                  "North Central And Central Coastal", "Central Highlands", 
                  "South East", "Mekong River Delta")
names(region_counts)[1] <- "region_name"
region_counts$region_name <- factor(region_counts$region_name, levels = 1:6, labels = region_names)

# Display the final data frame
print(region_counts)
##                         region_name child_marriage married_u18_perc_of_total
## 1                   Red River Delta            142                  7.226463
## 2    Northern Midlands And Mountain            888                 45.190840
## 3 North Central And Central Coastal            215                 10.941476
## 4                 Central Highlands            271                 13.791349
## 5                        South East            194                  9.872774
## 6                Mekong River Delta            255                 12.977099
# Updated mapping including all provinces and cities in the Red River Delta
province_to_region <- data.frame(
  NAME_1 = c(
    'Bắc Ninh', 'Hà Nam', 'Hà Nội', 'Hải Dương', 'Hải Phòng', 'Hoà Bình', 'Hưng Yên', 'Nam Định', 'Ninh Bình', 'Thái Bình', 'Vĩnh Phúc', # Red River Delta 11
    
    'Bắc Giang', 'Bắc Kạn', 'Cao Bằng', 'Hà Giang', 'Lạng Sơn', 'Lào Cai', 'Phú Thọ', 'Quảng Ninh', 'Thái Nguyên', 'Tuyên Quang', 'Yên Bái', 'Điện Biên', 'Lai Châu', 'Sơn La', # Northern Midlands And Mountain 14
    
    'Bình Định', 'Bình Thuận', 'Khánh Hòa', 'Ninh Thuận', 'Phú Yên', 'Quảng Nam', 'Quảng Ngãi', 'Thừa Thiên Huế', 'Đà Nẵng', 'Hà Tĩnh', 'Nghệ An', 'Quảng Bình', 'Quảng Trị', 'Thanh Hóa', # North Central And Central Coastal 14
    
    'Đắk Lắk', 'Đắk Nông', 'Gia Lai', 'Kon Tum', 'Lâm Đồng', # Central Highlands 5
    
    'Bà Rịa - Vũng Tàu', 'Bình Dương', 'Bình Phước', 'Đồng Nai', 'Hồ Chí Minh', 'Tây Ninh', # South East 6
    
    'An Giang', 'Bạc Liêu', 'Bến Tre', 'Cà Mau', 'Cần Thơ', 'Đồng Tháp', 'Hậu Giang', 'Kiên Giang', 'Long An', 'Sóc Trăng', 'Tiền Giang', 'Trà Vinh', 'Vĩnh Long' # Mekong River Delta 13
  ),
  Region = c(
    rep('Red River Delta', 11), 
    rep('Northern Midlands And Mountain', 14), 
    rep('North Central And Central Coastal', 14), 
    rep('Central Highlands', 5), 
    rep('South East', 6), 
    rep('Mekong River Delta', 13)
  )
)
# Check the mapping
#print(province_to_region)
# Read shapefile
vietnam_shape <- st_read('/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp')
## Multiple layers are present in data source /Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp, reading layer `gadm41_VNM_2'.
## Use `st_layers' to list all layer names and their type in a data source.
## Set the `layer' argument in `st_read' to read a particular layer.
## Reading layer `gadm41_VNM_2' from data source 
##   `/Users/hollyduong/Desktop/DA 401/ChildMarriageInVietnam/gadm41_VNM_shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 710 features and 13 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 102.1446 ymin: 8.381355 xmax: 109.4692 ymax: 23.39269
## Geodetic CRS:  WGS 84
# Join the shapefile with the province-to-region mapping
vietnam_shape_with_region <- vietnam_shape %>%
  left_join(province_to_region, by = "NAME_1")

# Aggregate the shapefile data by region
vietnam_regions <- vietnam_shape_with_region %>%
  group_by(Region) %>%
  summarise(geometry = st_union(geometry), .groups = 'drop')
# Join the aggregated shapefile data with the child marriage data
vietnam_map_data <- vietnam_regions %>%
  left_join(region_counts, by = c("Region" = "region_name"))

# Define colors with your specific choices
colors_ordered <- setNames(c("#B20033", "#CD0A25", "#E83853", "#EF7D8D", "#F7C0C8", "#FBE1E5"),
                           c("Northern Midlands And Mountain", "Central Highlands", 
                             "Mekong River Delta", "North Central And Central Coastal", 
                             "South East", "Red River Delta"))

# Plot
mapvn <- ggplot(data = vietnam_map_data) +
  geom_sf(aes(fill = factor(Region, levels = names(colors_ordered))), color = NA) +
  geom_sf_text(aes(label = sprintf("%.1f%%", married_u18_perc_of_total)), size = 4, hjust = 0.5, vjust = 0.5) +
  scale_fill_manual(values = colors_ordered, name = "Region") +
  labs(title = "Regional Contribution of Female Child Marriage Rates to Total Rates in Vietnam") +
  theme_void() +
  theme(legend.position = "right")

print(mapvn)
## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data

#ggsave("mapvn.png", plot = mapvn, width = 8, height = 6, dpi = 300)

Marital Status Among Female Respondants by Region in Vietnam

total_child_marriages_u18 <- sum(imputed_df$child_marriage == 1)

# Aggregate counts for each category by region
counts_df <- aggregate(cbind(married_u18 = imputed_df$child_marriage == 1, 
                             married_u16 = imputed_df$child_marriage_u16 == 1) ~ region, 
                       data = imputed_df, 
                       FUN = sum)

# Convert counts to percentages based on total child marriages under 18
counts_df$married_u18 <- (counts_df$married_u18 / total_child_marriages_u18) * 100
counts_df$married_u16 <- (counts_df$married_u16 / total_child_marriages_u18) * 100
p <- ggplot(counts_df, aes(x = factor(region))) +
  geom_bar(aes(y = married_u18, fill = "Married before 18 years old"), stat = "identity", width = 0.5) +
  geom_bar(aes(y = married_u16, fill = "Married before 16 years old"), stat = "identity", width = 0.5) +
  
  # Adding text labels for married_u18
  geom_text(aes(y = married_u18, label = sprintf("%.1f%%", married_u18)), 
            position = position_stack(vjust = 1.03), 
            size = 4, color = "black") +
  # Adding text labels for married_u16
  geom_text(aes(y = married_u16, label = sprintf("%.1f%%", married_u16)), 
            position = position_stack(vjust = 1.05), 
            size = 4, color = "black") +
  
  scale_fill_manual(values = c("Married before 18 years old" = "#EF7D8D", 
                               "Married before 16 years old" = "#B20016"),
                    name = "Marital Status") +
  labs(x = "Region", y = "Percentage", title = "Marital Status Among Female Respondents by Region", element_text(size = 14)) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5, size = 18),
        plot.margin = margin(t = 10, r = 10, b = 10, l = 10, unit = "mm"), 
        axis.text.x = element_text(size = 14, angle = 45, hjust = 1), 
        legend.position = c(0.85,0.8),
        legend.text = element_text(size = 12),
        legend.title = element_text(size = 14)) +
  scale_x_discrete(labels = c("1" = "Red River Delta", "2" = "Northern Midlands And Mountain", 
                              "3" = "North Central And Central Coastal", "4" = "Central Highlands", 
                              "5" = "South East", "6" = "Mekong River Delta"))
## Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
## 3.5.0.
## ℹ Please use the `legend.position.inside` argument of `theme()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p)

#ggsave("marital_status.png", plot = p, width = 10, height = 5)

Preparing for Regression Analysis

Correlation Matrix

# Calculate correlation matrix
cor_matrix <- cor(imputed_df %>% select_if(is.numeric), use = "complete.obs")

# Melt the correlation matrix
melted_cor_matrix <- melt(cor_matrix)
# Generate an interactive heatmap
corr_matrix <- ggplot(melted_cor_matrix, aes(Var1, Var2, fill = value)) +
    geom_tile() +
    scale_fill_gradientn(
        colours = c("deepskyblue", "white", "#CD0A25"),
        values = scales::rescale(c(-1, 0, 1)),
        limits = c(-1, 1),
        name="Pearson\nCorrelation"
    ) +
    theme_minimal() + 
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    xlab("") + 
    ylab("") +
    ggtitle("Correlation Matrix") 

# Convert ggplot object to plotly for interactivity
ggplotly(corr_matrix)
#write.csv(imputed_df, "imputed_df.csv", row.names = FALSE)

Converting Categorical and Binary Variables to Factors

# Convert nominal and ordinal variables to factors
imputed_df$area <- as.factor(imputed_df$area)
imputed_df$region <- as.factor(imputed_df$region)
imputed_df$education_level <- factor(imputed_df$education_level, ordered = FALSE)
imputed_df$ethnicity <- as.factor(imputed_df$ethnicity)
imputed_df$wealth_index <- factor(imputed_df$wealth_index, ordered = FALSE)

# Binary variables are already in the correct format and can be used as is

Logistic Regression Analysis

Model 1 - Base Logistic Regression Model

# Baseline model for reference
baseline_model <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = imputed_df)

# Summarize the baseline model
summary(baseline_model)
## 
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index + 
##     health_insurance + current_contraceptive_use + awareness_hiv_aids + 
##     access_to_media, family = binomial(), data = imputed_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.35056    0.13405  -2.615  0.00892 ** 
## area2                      0.40764    0.08124   5.018 5.22e-07 ***
## education_level1          -0.36537    0.08856  -4.126 3.70e-05 ***
## education_level2          -0.43403    0.08711  -4.983 6.27e-07 ***
## education_level3          -1.11556    0.11658  -9.569  < 2e-16 ***
## education_level4          -3.14435    0.51246  -6.136 8.47e-10 ***
## education_level5          -2.92930    0.25430 -11.519  < 2e-16 ***
## wealth_index2             -0.54988    0.08114  -6.777 1.23e-11 ***
## wealth_index3             -0.93401    0.10076  -9.270  < 2e-16 ***
## wealth_index4             -0.76067    0.11045  -6.887 5.70e-12 ***
## wealth_index5             -1.02057    0.15040  -6.786 1.16e-11 ***
## health_insurance           0.11657    0.08174   1.426  0.15384    
## current_contraceptive_use -0.03278    0.06298  -0.520  0.60280    
## awareness_hiv_aids        -0.45273    0.07058  -6.415 1.41e-10 ***
## access_to_media           -0.02742    0.08280  -0.331  0.74051    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9576.1  on 9267  degrees of freedom
## Residual deviance: 8000.9  on 9253  degrees of freedom
## AIC: 8030.9
## 
## Number of Fisher Scoring iterations: 7

Model 2 - Adding Fixed Effects to Base Model

# Enhanced model with additional fixed effects
enhanced_model <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, 
                      family = binomial(), 
                      data = imputed_df)

# Summarize the new model with FEs
summary(enhanced_model)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media, family = binomial(), 
##     data = imputed_df)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.193525   0.182449  -6.542 6.08e-11 ***
## area2                      0.268252   0.085814   3.126  0.00177 ** 
## region2                    0.364102   0.130710   2.786  0.00534 ** 
## region3                    0.162343   0.129158   1.257  0.20878    
## region4                    0.331093   0.127873   2.589  0.00962 ** 
## region5                   -0.009249   0.127505  -0.073  0.94217    
## region6                   -0.041750   0.134893  -0.310  0.75694    
## education_level1           0.023045   0.096277   0.239  0.81083    
## education_level2          -0.031527   0.095643  -0.330  0.74167    
## education_level3          -0.739558   0.124044  -5.962 2.49e-09 ***
## education_level4          -2.815602   0.514641  -5.471 4.47e-08 ***
## education_level5          -2.597688   0.257538 -10.087  < 2e-16 ***
## ethnicity2                -0.004214   0.106644  -0.040  0.96848    
## ethnicity3                 0.284476   0.130154   2.186  0.02884 *  
## ethnicity4                 1.200228   0.109158  10.995  < 2e-16 ***
## wealth_index2             -0.230751   0.086382  -2.671  0.00756 ** 
## wealth_index3             -0.612377   0.106964  -5.725 1.03e-08 ***
## wealth_index4             -0.432981   0.118543  -3.653  0.00026 ***
## wealth_index5             -0.685336   0.162011  -4.230 2.33e-05 ***
## health_insurance           0.002808   0.083641   0.034  0.97321    
## current_contraceptive_use  0.031968   0.065069   0.491  0.62322    
## awareness_hiv_aids        -0.184659   0.077076  -2.396  0.01658 *  
## access_to_media           -0.081214   0.087354  -0.930  0.35252    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9576.1  on 9267  degrees of freedom
## Residual deviance: 7749.7  on 9245  degrees of freedom
## AIC: 7795.7
## 
## Number of Fisher Scoring iterations: 7

Model 3 - Model with Interaction Terms (Ethnicity:Access_to_media)

# Adding the interaction term between ethnicity and access_to_media
model_interaction <- update(enhanced_model, . ~ . + ethnicity:access_to_media)
summary(model_interaction)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + ethnicity:access_to_media, 
##     family = binomial(), data = imputed_df)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -0.844376   0.193886  -4.355 1.33e-05 ***
## area2                       0.247577   0.086219   2.871 0.004085 ** 
## region2                     0.348302   0.131139   2.656 0.007908 ** 
## region3                     0.124623   0.129817   0.960 0.337062    
## region4                     0.260669   0.129389   2.015 0.043944 *  
## region5                     0.001056   0.127629   0.008 0.993398    
## region6                    -0.035320   0.135245  -0.261 0.793971    
## education_level1            0.030019   0.096452   0.311 0.755622    
## education_level2           -0.016781   0.095796  -0.175 0.860945    
## education_level3           -0.722075   0.124203  -5.814 6.11e-09 ***
## education_level4           -2.797439   0.514965  -5.432 5.56e-08 ***
## education_level5           -2.584112   0.257668 -10.029  < 2e-16 ***
## ethnicity2                 -0.477278   0.277893  -1.717 0.085889 .  
## ethnicity3                 -0.793864   0.342986  -2.315 0.020637 *  
## ethnicity4                  0.637537   0.163791   3.892 9.93e-05 ***
## wealth_index2              -0.195630   0.087183  -2.244 0.024839 *  
## wealth_index3              -0.561983   0.107917  -5.208 1.91e-07 ***
## wealth_index4              -0.375970   0.119608  -3.143 0.001670 ** 
## wealth_index5              -0.628571   0.162928  -3.858 0.000114 ***
## health_insurance            0.005625   0.083896   0.067 0.946543    
## current_contraceptive_use   0.030599   0.065196   0.469 0.638829    
## awareness_hiv_aids         -0.156176   0.077668  -2.011 0.044345 *  
## access_to_media            -0.517207   0.120815  -4.281 1.86e-05 ***
## ethnicity2:access_to_media  0.578710   0.288945   2.003 0.045195 *  
## ethnicity3:access_to_media  1.216550   0.352880   3.447 0.000566 ***
## ethnicity4:access_to_media  0.792243   0.177283   4.469 7.87e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9576.1  on 9267  degrees of freedom
## Residual deviance: 7722.1  on 9242  degrees of freedom
## AIC: 7774.1
## 
## Number of Fisher Scoring iterations: 7

Logistic Regression Models Comparison (Odd Ratios and 95% Confidence Intervals)

# Function to add significance asterisks
add_asterisks <- function(p_value) {
  if (is.na(p_value)) {
    return(NA)
  } else if (p_value < 0.001) {
    return("***")
  } else if (p_value < 0.01) {
    return("**")
  } else if (p_value < 0.05) {
    return("*")
  } else {
    return("")
  }
}
# Function to format confidence intervals as a string
format_ci <- function(lower, upper) {
  paste0("(", round(lower, 2), ", ", round(upper, 2), ")")
}
# Tidy the baseline model with confidence intervals
tidy_baseline <- tidy(baseline_model, conf.int = TRUE, exponentiate = TRUE)

# Tidy the enhanced model with confidence intervals
tidy_enhanced <- tidy(enhanced_model, conf.int = TRUE, exponentiate = TRUE)

# Tidy the interaction model with confidence intervals
tidy_interaction <- tidy(model_interaction, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_baseline$asterisks <- sapply(tidy_baseline$p.value, add_asterisks)
tidy_enhanced$asterisks <- sapply(tidy_enhanced$p.value, add_asterisks)
tidy_interaction$asterisks <- sapply(tidy_interaction$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_baseline <- tidy_baseline %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_enhanced <- tidy_enhanced %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_interaction <- tidy_interaction %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )
# Add a 'Model' column to each tidied dataframe
tidy_baseline <- tidy_baseline %>% mutate(Model = "Model_1")
tidy_enhanced <- tidy_enhanced %>% mutate(Model = "Model_2")
tidy_interaction <- tidy_interaction %>% mutate(Model = "Model_3")

# Combine and pivot the dataframes
combined_results <- bind_rows(
  tidy_baseline %>% dplyr::select(term, OR, CI, Model),
  tidy_enhanced %>% dplyr::select(term, OR, CI, Model),
  tidy_interaction %>% dplyr::select(term, OR, CI, Model)
) %>%
  pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_results[is.na(combined_results)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_results <- combined_results %>%
  dplyr::select(term, 
         OR_Model_1, CI_Model_1, 
         OR_Model_2, CI_Model_2, 
         OR_Model_3, CI_Model_3)
# Print the final combined table
print(combined_results)
## # A tibble: 26 × 7
##    term        OR_Model_1 CI_Model_1 OR_Model_2 CI_Model_2 OR_Model_3 CI_Model_3
##    <chr>       <chr>      <chr>      <chr>      <chr>      <chr>      <chr>     
##  1 (Intercept) 0.7**      (0.54, 0.… 0.3***     (0.21, 0.… 0.43***    (0.29, 0.…
##  2 area2       1.5***     (1.28, 1.… 1.31**     (1.11, 1.… 1.28**     (1.08, 1.…
##  3 education_… 0.69***    (0.58, 0.… 1.02       (0.85, 1.… 1.03       (0.85, 1.…
##  4 education_… 0.65***    (0.55, 0.… 0.97       (0.8, 1.1… 0.98       (0.82, 1.…
##  5 education_… 0.33***    (0.26, 0.… 0.48***    (0.37, 0.… 0.49***    (0.38, 0.…
##  6 education_… 0.04***    (0.01, 0.… 0.06***    (0.02, 0.… 0.06***    (0.02, 0.…
##  7 education_… 0.05***    (0.03, 0.… 0.07***    (0.04, 0.… 0.08***    (0.04, 0.…
##  8 wealth_ind… 0.58***    (0.49, 0.… 0.79**     (0.67, 0.… 0.82*      (0.69, 0.…
##  9 wealth_ind… 0.39***    (0.32, 0.… 0.54***    (0.44, 0.… 0.57***    (0.46, 0.…
## 10 wealth_ind… 0.47***    (0.38, 0.… 0.65***    (0.51, 0.… 0.69**     (0.54, 0.…
## # ℹ 16 more rows

Models Validation

Likelihood Ratio Test (LRT)

# Model 1 vs. Model 2
anova(baseline_model, enhanced_model, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance + 
##     current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      9253     8000.9                          
## 2      9245     7749.7  8   251.12 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Model 2 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).

# Model 1 vs. Model 3
anova(baseline_model, model_interaction, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance + 
##     current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + ethnicity:access_to_media
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      9253     8000.9                          
## 2      9242     7722.1 11   278.72 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Model 3 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).

# Model 2 vs. Model 3
anova(enhanced_model, model_interaction, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + ethnicity:access_to_media
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      9245     7749.7                          
## 2      9242     7722.1  3   27.602 4.401e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value is extremely small (4.401e-06), far below the common significance level of 0.05. This means the difference in fit between the two models is statistically significant.

Adding the interaction terms between ethnicity and access_to_media to Model 3 improved the model fit compared to the Model 2 without interaction terms. This indicates that the relationship between ethnicity and child marriage is significantly affected by access to media.

Based on this analysis, Model 2 would be preferred over Model 1 for understanding or predicting child marriage, as it accounts for the interaction effect that is significant in your data.

ROC Curve and AUC

invisible(plot(roc(imputed_df$child_marriage,
                   fitted(baseline_model)),
               col = "red",
               main = "ROC Curve: \nModel 1 (red) vs. Model 2 (green) vs. Model 3 (blue)"))

invisible(plot(roc(imputed_df$child_marriage,
                   fitted(enhanced_model)),
               col = "green",
               add = T))

invisible(plot(roc(imputed_df$child_marriage,
                   fitted(model_interaction)),
               print.auc = T,
               col = "blue",
               add = T))

# For each model
roc_response_model_1 <- roc(imputed_df$child_marriage, fitted(baseline_model))
auc_model_1 <- auc(roc_response_model_1)

roc_response_model_2 <- roc(imputed_df$child_marriage, fitted(enhanced_model))
auc_model_2 <- auc(roc_response_model_2)

roc_response_model_3 <- roc(imputed_df$child_marriage, fitted(model_interaction))
auc_model_3 <- auc(roc_response_model_3)

# Compare AUC values
print(paste("AUC Model 1:", auc_model_1))
## [1] "AUC Model 1: 0.772320518006648"
print(paste("AUC Model 2:", auc_model_2))
## [1] "AUC Model 2: 0.786801269233356"
print(paste("AUC Model 3:", auc_model_3))
## [1] "AUC Model 3: 0.789551716172274"
  1. Model 1 (AUC: 0.7723):

An AUC of 0.5 represents a model with no discriminative ability (akin to random guessing), while an AUC of 1.0 represents perfect discrimination. So, my model is performing substantially better than random guessing.

The AUC of 0.7 indicates that the Model 1 has good discriminative ability. In other words, it is capable of distinguishing between cases and controls with a high degree of accuracy.

  1. Model 2 (AUC: 0.7868):

This model shows a slight improvement in AUC over the Model 1. The increase suggests that the additional variables (or adjustments) I made in the Model 2 contribute positively to its ability to differentiate between cases and controls. The difference in AUC between the Model 1 and Model 2, while modest, is still meaningful, especially in practical, real-world contexts.

  1. Model 3 (AUC: 0.7896):

The Model 3 shows a very slight improvement in AUC over the Model 2. This indicates that adding interaction terms provides a marginal improvement in the model’s discriminatory power. However, the improvement is very minimal, which aligns with my earlier findings that the interaction terms did not significantly improve the model fit.

  1. Overall:

All models demonstrate good ability to distinguish between cases and controls. An AUC greater than 0.7 is generally considered acceptable, and my models are above 0.7 and close to 0.8. The Model 2 and Model 3 only show marginal improvements in AUC compared to the Model 1. This suggests that while the additional complexity (more variables and interaction terms) does contribute slightly to model performance, the gains are not substantial.

Residuals Plot

binnedplot(fitted(model_interaction),
           residuals(model_interaction, type = "response"),
           nclass = NULL,
           xlab = "Expected Values",
           ylab = "Average Residuals",
           main = "Binned Residual Plot",
           cex.pts = 1,
           col.int = "gray")

Hosmer-Lemeshow Test

# 1. Hosmer-Lemeshow Test for the Model 1
hoslem.test(baseline_model$y, fitted(baseline_model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  baseline_model$y, fitted(baseline_model)
## X-squared = 12.943, df = 8, p-value = 0.1138
# 2. Hosmer-Lemeshow Test for the Model 2 (Baseline + Fixed Effects)
hoslem.test(enhanced_model$y, fitted(enhanced_model), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  enhanced_model$y, fitted(enhanced_model)
## X-squared = 5.3687, df = 8, p-value = 0.7175
# 3. Hosmer-Lemeshow Test for the Model 3 (Baseline + Fixed Effects + Interaction Terms)
hoslem.test(model_interaction$y, fitted(model_interaction), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  model_interaction$y, fitted(model_interaction)
## X-squared = 5.9739, df = 8, p-value = 0.6502

A large p-value (>0.05) indicates a good fit, meaning that there’s no significant difference between the observed and predicted values. Through each model, the p-value increases which suggests that our decision to include fixed effects and interaction terms are significant.

Accessing Multicollinearity (VIF)

# VIFs check (A VIF value > 5 indicates high multicollinearity)
# Base model
model_1_vif <- vif(baseline_model)
print(model_1_vif)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.116394  1        1.056596
## education_level           1.673060  5        1.052813
## wealth_index              1.418920  4        1.044708
## health_insurance          1.018544  1        1.009229
## current_contraceptive_use 1.015178  1        1.007560
## awareness_hiv_aids        1.482419  1        1.217546
## access_to_media           1.263256  1        1.123947
# Enhanced model
model_2_vif <- vif(enhanced_model)
print(model_2_vif)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.250833  1        1.118406
## region                    4.773168  5        1.169178
## education_level           1.957540  5        1.069476
## ethnicity                 4.140370  3        1.267185
## wealth_index              1.953237  4        1.087287
## health_insurance          1.049221  1        1.024315
## current_contraceptive_use 1.022997  1        1.011433
## awareness_hiv_aids        1.649486  1        1.284323
## access_to_media           1.310264  1        1.144668
# Interacton model
model_3_vif <- vif(model_interaction)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(model_3_vif)
##                                 GVIF Df GVIF^(1/(2*Df))
## area                        1.261333  1        1.123091
## region                      5.010354  5        1.174862
## education_level             1.973402  5        1.070340
## ethnicity                 504.179961  3        2.821181
## wealth_index                2.017338  4        1.091685
## health_insurance            1.050125  1        1.024756
## current_contraceptive_use   1.024107  1        1.011982
## awareness_hiv_aids          1.672349  1        1.293193
## access_to_media             2.594899  1        1.610869
## ethnicity:access_to_media 422.759211  3        2.739569
  • Model 1 has the least concern with multicollinearity.
  • Model 2 introduces region and ethnicity, with a mild increase in multicollinearity, but not at alarming levels.
  • Model 3 exhibits more noticeable multicollinearity, particularly with the ethnicity variable and its interaction with access_to_media; their VIF values are significantly higher (around 2.82 and 2.74, respectively). Although these values are below 5, the increase is substantial compared to the previous models and could start to affect the reliability and interpretability of the regression coefficients for these variables.
  • High multicollinearity in models, especially those with interaction terms, doesn’t invalidate the model but does complicate the interpretation of specific coefficients.

Assessing AIC and BIC

# Create a data frame to hold AIC and BIC values
aic_bic_comparison <- data.frame(
    Model = c("Model 1", "Model 2", "Model 3"),
    AIC = c(AIC(baseline_model), AIC(enhanced_model), AIC(model_interaction)),
    BIC = c(BIC(baseline_model), BIC(enhanced_model), BIC(model_interaction))
)

# Print the table
print(aic_bic_comparison)
##     Model      AIC      BIC
## 1 Model 1 8030.853 8137.868
## 2 Model 2 7795.737 7959.826
## 3 Model 3 7774.134 7959.627

Model 3 has the lowest AIC value of all, indicating that it might be the best model among the three in terms of balancing fit and complexity according to AIC. However, its BIC is very slightly lower than that of Model 2, making it almost equivalent to Model 2 in terms of BIC. This small difference in BIC suggests that Model 3 and Model 2 have similar levels of complexity-adjusted fit.

Conservative Approach: Redo Analysis with Actual Data (No Imputation)

Data Prep

Recreate DataFrames with Removed NAs

# No imputation -- Remove rows with any missing data from 'female_df'
og_df <- na.omit(married_df)
print(og_df)
## # A tibble: 7,591 × 11
##    age_first_marriage marital_status  area region education_level
##                 <dbl>          <dbl> <dbl>  <dbl>           <dbl>
##  1                 23              1     1      1               5
##  2                 17              1     1      1               2
##  3                 23              1     1      1               5
##  4                 22              1     1      1               5
##  5                 26              1     1      1               5
##  6                 23              1     1      1               5
##  7                 27              1     1      1               4
##  8                 25              1     1      1               5
##  9                 21              1     1      1               3
## 10                 25              1     1      1               5
## # ℹ 7,581 more rows
## # ℹ 6 more variables: health_insurance <dbl>, ethnicity <dbl>,
## #   wealth_index <dbl>, current_contraceptive_use <dbl>,
## #   awareness_hiv_aids <dbl>, access_to_media <dbl>

Recreate Necessary Variables

# Convert "age at first marriage" into a binary variable to indicate child marriage
# Child marriage is defined as marriage before the age of 18
# The new binary variable "child_marriage" will have a value of 1 if the marriage occurred before age 18, and 0 otherwise
og_df <- og_df %>%
  mutate(child_marriage = ifelse(age_first_marriage < 18, 1, 0))
# Create a binary variable for child marriage under 16
# The new variable "child_marriage_u16" will have a value of 1 if the marriage occurred before age 16, and 0 otherwise
og_df <- og_df %>%
  mutate(child_marriage_u16 = ifelse(age_first_marriage < 16, 1, 0))
# Move "child_marriage" and "child_marriage_u16" to the front of the dataframe
og_df <- og_df %>%
  dplyr::select(child_marriage, child_marriage_u16, everything())

Converting Categorical and Binary Variables to Factors

# Convert nominal and ordinal variables to factors
og_df$area <- as.factor(og_df$area)
og_df$region <- as.factor(og_df$region)
og_df$education_level <- factor(og_df$education_level, ordered = FALSE)
og_df$ethnicity <- as.factor(og_df$ethnicity)
og_df$wealth_index <- factor(og_df$wealth_index, ordered = FALSE)

# Binary variables are already in the correct format and can be used as is
#write.csv(og_df, "og_df.csv", row.names = FALSE)

Recreate Logistic Regression Models Without Imputed Data

Model 1: Base Model

# Base model
base_model_no_impute <- glm(child_marriage ~ area + education_level + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)

# Summarize the base model
summary(base_model_no_impute)
## 
## Call:
## glm(formula = child_marriage ~ area + education_level + wealth_index + 
##     health_insurance + current_contraceptive_use + awareness_hiv_aids + 
##     access_to_media, family = binomial(), data = og_df)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -0.12285    0.15692  -0.783  0.43370    
## area2                      0.26196    0.09295   2.818  0.00483 ** 
## education_level1          -0.43948    0.10272  -4.278 1.88e-05 ***
## education_level2          -0.51284    0.10331  -4.964 6.90e-07 ***
## education_level3          -1.27682    0.13682  -9.332  < 2e-16 ***
## education_level4          -3.38049    0.59104  -5.720 1.07e-08 ***
## education_level5          -3.10583    0.27962 -11.107  < 2e-16 ***
## wealth_index2             -0.61689    0.09276  -6.651 2.92e-11 ***
## wealth_index3             -0.95525    0.11163  -8.558  < 2e-16 ***
## wealth_index4             -0.75531    0.12062  -6.262 3.80e-10 ***
## wealth_index5             -1.01614    0.15960  -6.367 1.93e-10 ***
## health_insurance           0.09667    0.09195   1.051  0.29308    
## current_contraceptive_use -0.08254    0.06980  -1.182  0.23704    
## awareness_hiv_aids        -0.47372    0.08239  -5.750 8.94e-09 ***
## access_to_media            0.01077    0.10367   0.104  0.91726    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7404.1  on 7590  degrees of freedom
## Residual deviance: 6086.8  on 7576  degrees of freedom
## AIC: 6116.8
## 
## Number of Fisher Scoring iterations: 7

Model 2: Base Model with Fixed Effects

# Enhanced model with additional fixed effects
enhanced_model_no_impute <- glm(child_marriage ~ area + region + education_level + ethnicity + wealth_index + health_insurance + current_contraceptive_use + awareness_hiv_aids + access_to_media, family = binomial(), data = og_df)

# Summarize the new model with FEs
summary(enhanced_model_no_impute)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media, family = binomial(), 
##     data = og_df)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               -1.6510890  0.2290549  -7.208 5.67e-13 ***
## area2                      0.1470706  0.0980382   1.500  0.13358    
## region2                    0.0495982  0.1566174   0.317  0.75148    
## region3                   -0.0227665  0.1516838  -0.150  0.88069    
## region4                   -0.1639096  0.1668163  -0.983  0.32582    
## region5                   -0.0308200  0.1398462  -0.220  0.82557    
## region6                    0.0503335  0.1461564   0.344  0.73056    
## education_level1           0.1358036  0.1169557   1.161  0.24558    
## education_level2           0.0826556  0.1189010   0.695  0.48695    
## education_level3          -0.7178808  0.1495591  -4.800 1.59e-06 ***
## education_level4          -2.8361281  0.5948134  -4.768 1.86e-06 ***
## education_level5          -2.5784165  0.2856381  -9.027  < 2e-16 ***
## ethnicity2                 0.5559934  0.1376108   4.040 5.34e-05 ***
## ethnicity3                 0.3371900  0.1396231   2.415  0.01574 *  
## ethnicity4                 1.9238662  0.1608494  11.961  < 2e-16 ***
## wealth_index2             -0.0532348  0.1077890  -0.494  0.62139    
## wealth_index3             -0.3666087  0.1293455  -2.834  0.00459 ** 
## wealth_index4             -0.1573256  0.1414044  -1.113  0.26588    
## wealth_index5             -0.4268584  0.1831853  -2.330  0.01980 *  
## health_insurance          -0.0958692  0.0954439  -1.004  0.31516    
## current_contraceptive_use  0.0008332  0.0728182   0.011  0.99087    
## awareness_hiv_aids        -0.0914918  0.0944656  -0.969  0.33278    
## access_to_media            0.1818904  0.1103596   1.648  0.09932 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7404.1  on 7590  degrees of freedom
## Residual deviance: 5849.7  on 7568  degrees of freedom
## AIC: 5895.7
## 
## Number of Fisher Scoring iterations: 7

Model 3: Base Model with Fixed Effects and Interaction Terms

# Adding the interaction term between education level and wealth index
interaction_model_no_impute <- update(enhanced_model_no_impute, . ~ . + ethnicity:access_to_media)
summary(interaction_model_no_impute)
## 
## Call:
## glm(formula = child_marriage ~ area + region + education_level + 
##     ethnicity + wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + ethnicity:access_to_media, 
##     family = binomial(), data = og_df)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.320135   0.314012  -4.204 2.62e-05 ***
## area2                       0.142438   0.098197   1.451  0.14691    
## region2                     0.055723   0.157017   0.355  0.72268    
## region3                    -0.014421   0.151753  -0.095  0.92429    
## region4                    -0.159251   0.166939  -0.954  0.34011    
## region5                    -0.022088   0.140015  -0.158  0.87465    
## region6                     0.048868   0.146347   0.334  0.73844    
## education_level1            0.128960   0.117054   1.102  0.27059    
## education_level2            0.079831   0.118844   0.672  0.50176    
## education_level3           -0.718660   0.149583  -4.804 1.55e-06 ***
## education_level4           -2.834696   0.594879  -4.765 1.89e-06 ***
## education_level5           -2.580240   0.285640  -9.033  < 2e-16 ***
## ethnicity2                  0.415596   0.369280   1.125  0.26041    
## ethnicity3                 -0.321475   0.406418  -0.791  0.42895    
## ethnicity4                  1.542567   0.301560   5.115 3.13e-07 ***
## wealth_index2              -0.053570   0.108105  -0.496  0.62022    
## wealth_index3              -0.360333   0.129734  -2.777  0.00548 ** 
## wealth_index4              -0.146634   0.141931  -1.033  0.30154    
## wealth_index5              -0.413285   0.183715  -2.250  0.02447 *  
## health_insurance           -0.097032   0.095522  -1.016  0.30972    
## current_contraceptive_use  -0.002748   0.072895  -0.038  0.96993    
## awareness_hiv_aids         -0.083608   0.094607  -0.884  0.37684    
## access_to_media            -0.166428   0.253988  -0.655  0.51230    
## ethnicity2:access_to_media  0.135750   0.367471   0.369  0.71182    
## ethnicity3:access_to_media  0.725684   0.420694   1.725  0.08453 .  
## ethnicity4:access_to_media  0.429261   0.289543   1.483  0.13820    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7404.1  on 7590  degrees of freedom
## Residual deviance: 5845.7  on 7565  degrees of freedom
## AIC: 5897.7
## 
## Number of Fisher Scoring iterations: 7

Models With vs Without Imputed Data

# Extracting data from the models
extract_model_data <- function(model) {
  model_summary <- summary(model)
  coeffs <- model_summary$coefficients
  data.frame(
    Term = rownames(coeffs),
    Estimate = sprintf("%.3f", coeffs[, "Estimate"]),
    pValue = ifelse(coeffs[, "Pr(>|z|)"] < 0.001, 
                    format(coeffs[, "Pr(>|z|)"], scientific = TRUE),
                    sprintf("%.3f", coeffs[, "Pr(>|z|)"])),
    Significance = sapply(coeffs[, "Pr(>|z|)"], add_asterisks)
  )
}

Model 1: Base Model

# Applying the function to each model
base_impute_data <- extract_model_data(baseline_model)
base_no_impute_data <- extract_model_data(base_model_no_impute)

# Combine the data for comparison
base_comparison_data <- merge(base_impute_data, base_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)

# View the comparison
print(base_comparison_data)
##                         Term Estimate_Impute pValue_Impute Significance_Impute
## 1                (Intercept)          -0.351         0.009                  **
## 2                      area2           0.408  5.222876e-07                 ***
## 3           education_level1          -0.365  3.696088e-05                 ***
## 4           education_level2          -0.434  6.270603e-07                 ***
## 5           education_level3          -1.116  1.076550e-21                 ***
## 6           education_level4          -3.144  8.474392e-10                 ***
## 7           education_level5          -2.929  1.060085e-30                 ***
## 8              wealth_index2          -0.550  1.230578e-11                 ***
## 9              wealth_index3          -0.934  1.867146e-20                 ***
## 10             wealth_index4          -0.761  5.702313e-12                 ***
## 11             wealth_index5          -1.021  1.156489e-11                 ***
## 12          health_insurance           0.117         0.154                    
## 13 current_contraceptive_use          -0.033         0.603                    
## 14        awareness_hiv_aids          -0.453  1.412117e-10                 ***
## 15           access_to_media          -0.027         0.741                    
##    Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1             -0.123           0.434                      
## 2              0.262           0.005                    **
## 3             -0.439    1.881675e-05                   ***
## 4             -0.513    6.901996e-07                   ***
## 5             -1.277    1.036155e-20                   ***
## 6             -3.380    1.067977e-08                   ***
## 7             -3.106    1.155012e-28                   ***
## 8             -0.617    2.917388e-11                   ***
## 9             -0.955    1.153337e-17                   ***
## 10            -0.755    3.797438e-10                   ***
## 11            -1.016    1.931436e-10                   ***
## 12             0.097           0.293                      
## 13            -0.083           0.237                      
## 14            -0.474    8.940563e-09                   ***
## 15             0.011           0.917
  • Most variables show consistent results in terms of significance and direction of effect across both models. This suggests that the imputation process has not drastically altered the relationships between these predictors and the outcome variable.
  • Education and Wealth Index variables are consistent, robust and significant predictors of the outcome across both models.
  • Significance of Area: This variable’s significance suggests geographical variation is an important factor, but its influence is reduced in the non-imputed model.
  • The overall patterns suggest that while some predictors are consistently influential, others are sensitive to how missing data is handled.

Model 2: Base Model with Fixed Effects

# Applying the function to each model
enhanced_impute_data <- extract_model_data(enhanced_model)
enhanced_no_impute_data <- extract_model_data(enhanced_model_no_impute)

# Combine the data for comparison
enhanced_comparison_data <- merge(enhanced_impute_data, enhanced_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)

# View the comparison
print(enhanced_comparison_data)
##                         Term Estimate_Impute pValue_Impute Significance_Impute
## 1                (Intercept)          -1.194  6.082446e-11                 ***
## 2                      area2           0.268         0.002                  **
## 3                    region2           0.364         0.005                  **
## 4                    region3           0.162         0.209                    
## 5                    region4           0.331         0.010                  **
## 6                    region5          -0.009         0.942                    
## 7                    region6          -0.042         0.757                    
## 8           education_level1           0.023         0.811                    
## 9           education_level2          -0.032         0.742                    
## 10          education_level3          -0.740  2.491054e-09                 ***
## 11          education_level4          -2.816  4.474974e-08                 ***
## 12          education_level5          -2.598  6.330343e-24                 ***
## 13                ethnicity2          -0.004         0.968                    
## 14                ethnicity3           0.284         0.029                   *
## 15                ethnicity4           1.200  4.023378e-28                 ***
## 16             wealth_index2          -0.231         0.008                  **
## 17             wealth_index3          -0.612  1.033996e-08                 ***
## 18             wealth_index4          -0.433  2.596704e-04                 ***
## 19             wealth_index5          -0.685  2.334946e-05                 ***
## 20          health_insurance           0.003         0.973                    
## 21 current_contraceptive_use           0.032         0.623                    
## 22        awareness_hiv_aids          -0.185         0.017                   *
## 23           access_to_media          -0.081         0.353                    
##    Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1             -1.651    5.666765e-13                   ***
## 2              0.147           0.134                      
## 3              0.050           0.751                      
## 4             -0.023           0.881                      
## 5             -0.164           0.326                      
## 6             -0.031           0.826                      
## 7              0.050           0.731                      
## 8              0.136           0.246                      
## 9              0.083           0.487                      
## 10            -0.718    1.586814e-06                   ***
## 11            -2.836    1.859743e-06                   ***
## 12            -2.578    1.766623e-19                   ***
## 13             0.556    5.337557e-05                   ***
## 14             0.337           0.016                     *
## 15             1.924    5.710335e-33                   ***
## 16            -0.053           0.621                      
## 17            -0.367           0.005                    **
## 18            -0.157           0.266                      
## 19            -0.427           0.020                     *
## 20            -0.096           0.315                      
## 21             0.001           0.991                      
## 22            -0.091           0.333                      
## 23             0.182           0.099
  • Consistent findings across the imputed and non-imputed models in certain variables (e.g., education levels 3, 4, 5, and ethnicity4) suggest that imputation does not radically alter these relationships. This consistency can indicate that imputation is not introducing significant bias for these variables.
  • Imputation might reveal statistically significant relationships that are not evident in the non-imputed data (e.g., area2, region2, region4, awareness of HIV/AIDS).
  • For some variables (e.g., wealth_index categories), the significance and estimate values change between the models.
  • The loss of significance in many variables in the non-imputed model suggests that the missing data may not be random and could be related to the variables’ influence on the outcome.

Model 3: Base Model with Fixed Effects and Interaction Terms

# Applying the function to each model
interaction_impute_data <- extract_model_data(model_interaction)
interaction_no_impute_data <- extract_model_data(interaction_model_no_impute)

# Combine the data for comparison
interaction_comparison_data <- merge(interaction_impute_data, interaction_no_impute_data, by = "Term", suffixes = c("_Impute", "_NoImpute"), sort = FALSE)

# View the comparison
print(interaction_comparison_data)
##                          Term Estimate_Impute pValue_Impute Significance_Impute
## 1                 (Intercept)          -0.844  1.330622e-05                 ***
## 2                       area2           0.248         0.004                  **
## 3                     region2           0.348         0.008                  **
## 4                     region3           0.125         0.337                    
## 5                     region4           0.261         0.044                   *
## 6                     region5           0.001         0.993                    
## 7                     region6          -0.035         0.794                    
## 8            education_level1           0.030         0.756                    
## 9            education_level2          -0.017         0.861                    
## 10           education_level3          -0.722  6.111688e-09                 ***
## 11           education_level4          -2.797  5.563680e-08                 ***
## 12           education_level5          -2.584  1.138308e-23                 ***
## 13                 ethnicity2          -0.477         0.086                    
## 14                 ethnicity3          -0.794         0.021                   *
## 15                 ethnicity4           0.638  9.926885e-05                 ***
## 16              wealth_index2          -0.196         0.025                   *
## 17              wealth_index3          -0.562  1.913471e-07                 ***
## 18              wealth_index4          -0.376         0.002                  **
## 19              wealth_index5          -0.629  1.143325e-04                 ***
## 20           health_insurance           0.006         0.947                    
## 21  current_contraceptive_use           0.031         0.639                    
## 22         awareness_hiv_aids          -0.156         0.044                   *
## 23            access_to_media          -0.517  1.860773e-05                 ***
## 24 ethnicity2:access_to_media           0.579         0.045                   *
## 25 ethnicity3:access_to_media           1.217  5.658165e-04                 ***
## 26 ethnicity4:access_to_media           0.792  7.865819e-06                 ***
##    Estimate_NoImpute pValue_NoImpute Significance_NoImpute
## 1             -1.320    2.621321e-05                   ***
## 2              0.142           0.147                      
## 3              0.056           0.723                      
## 4             -0.014           0.924                      
## 5             -0.159           0.340                      
## 6             -0.022           0.875                      
## 7              0.049           0.738                      
## 8              0.129           0.271                      
## 9              0.080           0.502                      
## 10            -0.719    1.552054e-06                   ***
## 11            -2.835    1.887040e-06                   ***
## 12            -2.580    1.667393e-19                   ***
## 13             0.416           0.260                      
## 14            -0.321           0.429                      
## 15             1.543    3.132494e-07                   ***
## 16            -0.054           0.620                      
## 17            -0.360           0.005                    **
## 18            -0.147           0.302                      
## 19            -0.413           0.024                     *
## 20            -0.097           0.310                      
## 21            -0.003           0.970                      
## 22            -0.084           0.377                      
## 23            -0.166           0.512                      
## 24             0.136           0.712                      
## 25             0.726           0.085                      
## 26             0.429           0.138
  • Consistency in the significance of variables like education levels 3, 4, and 5, and ethnicity4 across both models suggests that imputation is not drastically changing these relationships.
  • The imputed model shows significant effects in variables (like region2, region4, ethnicity3, and interaction terms) that are not significant in the non-imputed model.
  • The differing levels of significance for variables like wealth index categories and awareness of HIV/AIDS between models raise questions about how missing data impacts these variables.
  • The loss of significance in many variables in the non-imputed model suggests that the missing data may not be random and could be related to the variables’ influence on the outcome.

Logistic Regression Models Comparison (Odd Ratios and 95% Confidence Intervals)

# Tidy the baseline model with confidence intervals
tidy_base_no_impute <- tidy(base_model_no_impute, conf.int = TRUE, exponentiate = TRUE)

# Tidy the enhanced model with confidence intervals
tidy_enhanced_no_impute <- tidy(enhanced_model_no_impute, conf.int = TRUE, exponentiate = TRUE)

# Tidy the interaction model with confidence intervals
tidy_interaction_no_impute <- tidy(interaction_model_no_impute, conf.int = TRUE, exponentiate = TRUE)
# Apply the function to each model's p.value
tidy_base_no_impute$asterisks <- sapply(tidy_base_no_impute$p.value, add_asterisks)
tidy_enhanced_no_impute$asterisks <- sapply(tidy_enhanced_no_impute$p.value, add_asterisks)
tidy_interaction_no_impute$asterisks <- sapply(tidy_interaction_no_impute$p.value, add_asterisks)
# Create OR strings with asterisks and format CIs as a string
tidy_base_no_impute <- tidy_base_no_impute %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_enhanced_no_impute <- tidy_enhanced_no_impute %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )

tidy_interaction_no_impute <- tidy_interaction_no_impute %>%
  mutate(
    OR = ifelse(is.na(estimate), NA, paste0(round(estimate, 2), asterisks)),
    CI = ifelse(is.na(conf.low) | is.na(conf.high), NA, format_ci(conf.low, conf.high))
  )
# Add a 'Model' column to each tidied dataframe
tidy_base_no_impute <- tidy_base_no_impute %>% mutate(Model = "Model_1")
tidy_enhanced_no_impute <- tidy_enhanced_no_impute %>% mutate(Model = "Model_2")
tidy_interaction_no_impute <- tidy_interaction_no_impute %>% mutate(Model = "Model_3")

# Combine and pivot the dataframes
combined_no_impute <- bind_rows(
  tidy_base_no_impute %>% dplyr::select(term, OR, CI, Model),
  tidy_enhanced_no_impute %>% dplyr::select(term, OR, CI, Model),
  tidy_interaction_no_impute %>% dplyr::select(term, OR, CI, Model)
) %>%
  pivot_wider(names_from = Model, values_from = c(OR, CI))
# Replace NAs with "—"
combined_no_impute[is.na(combined_no_impute)] <- "—"
# Reordering columns to have OR and CI next to each other for each model
combined_no_impute <- combined_no_impute %>%
  dplyr::select(term, 
         OR_Model_1, CI_Model_1, 
         OR_Model_2, CI_Model_2, 
         OR_Model_3, CI_Model_3)
# Print the final combined table
print(combined_no_impute)
## # A tibble: 26 × 7
##    term        OR_Model_1 CI_Model_1 OR_Model_2 CI_Model_2 OR_Model_3 CI_Model_3
##    <chr>       <chr>      <chr>      <chr>      <chr>      <chr>      <chr>     
##  1 (Intercept) 0.88       (0.65, 1.… 0.19***    (0.12, 0.… 0.27***    (0.14, 0.…
##  2 area2       1.3**      (1.08, 1.… 1.16       (0.96, 1.… 1.15       (0.95, 1.…
##  3 education_… 0.64***    (0.53, 0.… 1.15       (0.91, 1.… 1.14       (0.91, 1.…
##  4 education_… 0.6***     (0.49, 0.… 1.09       (0.86, 1.… 1.08       (0.86, 1.…
##  5 education_… 0.28***    (0.21, 0.… 0.49***    (0.36, 0.… 0.49***    (0.36, 0.…
##  6 education_… 0.03***    (0.01, 0.… 0.06***    (0.01, 0.… 0.06***    (0.01, 0.…
##  7 education_… 0.04***    (0.02, 0.… 0.08***    (0.04, 0.… 0.08***    (0.04, 0.…
##  8 wealth_ind… 0.54***    (0.45, 0.… 0.95       (0.77, 1.… 0.95       (0.77, 1.…
##  9 wealth_ind… 0.38***    (0.31, 0.… 0.69**     (0.54, 0.… 0.7**      (0.54, 0.…
## 10 wealth_ind… 0.47***    (0.37, 0.… 0.85       (0.65, 1.… 0.86       (0.65, 1.…
## # ℹ 16 more rows
  1. (Intercept) Odds Ratios and Significance:
  • In Model 1, the odds ratio for the intercept is 0.88, which indicates a slight negative association, but it’s not statistically significant, as indicated by the absence of asterisks.
  • In Model 2, the intercept has an OR of 0.19, which is highly significant (indicated by three asterisks) and suggests a much stronger negative association.
  • Model 3 shows a similar pattern to Model 2, with a significant OR of 0.27, indicating a robust negative association.
  1. Area2:
  • This term has an OR of 1.3 in Model 1 and is statistically significant (denoted by two asterisks), suggesting a moderately positive association with the outcome.
  • In Models 2 and 3, the ORs are 1.16 and 1.15, respectively, showing a slight positive association, but these are not statistically significant.
  1. Education Level Categories:
  • All education levels in Model 1 show a significant negative association (ORs less than 1, with three asterisks), meaning higher education levels are associated with lower odds of the outcome.
  • In Models 2 and 3, education_level1 and education_level2 lose their statistical significance (ORs 1.15 and 1.09 in Model 2, and 1.14 and 1.08 in Model 3), indicating no strong association.
  • However, education_level3, education_level4, and education_level5 maintain their strong, significant negative association in Models 2 and 3, as indicated by the ORs (0.49, 0.06, and 0.08) and the presence of three asterisks.
  1. Wealth Index Categories:
  • In Model 1, all wealth index categories show a significant negative association with the outcome, with ORs ranging from 0.36 to 0.54 and marked with three asterisks.
  • The strength of this association decreases in Models 2 and 3, with ORs moving closer to 1 and varying levels of significance, showing that wealth’s impact on the outcome is less pronounced or potentially confounded by other variables in these models.
  1. Health Insurance:
  • There is no statistically significant association between health insurance and the outcome in any model, as indicated by ORs around 1 and the absence of asterisks.
  1. Current Contraceptive Use:
  • Similar to health insurance, this variable shows no significant association with the outcome in any of the models.
  1. Awareness of HIV/AIDS:
  • In Model 1, there is a significant negative association (OR 0.62, three asterisks), but this association becomes non-significant in Models 2 and 3.
  1. Access to Media:

This variable shows no consistent or significant association with the outcome across the three models.

  1. Region Variables in Models 2 and 3:
  • These variables are only included in the latter two models and show varying ORs, but none of them are statistically significant.
  1. Ethnicity Categories in Models 2 and 3:
  • Ethnicity shows significant positive associations in these models, particularly for ethnicity4, which has very high ORs and is highly significant.
  1. Interaction Terms ethnicity:access_to_media in Model 3:
  • The interaction terms involving ethnicity and access to media are introduced in Model 3, but none show statistical significance.

Models Validation

Likelihood Ratio Test (LRT)

# Model 1 vs. Model 2
anova(base_model_no_impute, enhanced_model_no_impute, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance + 
##     current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7576     6086.8                          
## 2      7568     5849.7  8   237.05 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Model 2 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).

# Model 1 vs. Model 3
anova(base_model_no_impute, interaction_model_no_impute, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + education_level + wealth_index + health_insurance + 
##     current_contraceptive_use + awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + ethnicity:access_to_media
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      7576     6086.8                          
## 2      7565     5845.7 11   241.13 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The Model 3 provides a significantly better fit to the data compared to the Model 1, as indicated by the large decrease in residual deviance and the very small p-value (< 2.2e-16).

# Model 2 vs. Model 3
anova(enhanced_model_no_impute, interaction_model_no_impute, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media
## Model 2: child_marriage ~ area + region + education_level + ethnicity + 
##     wealth_index + health_insurance + current_contraceptive_use + 
##     awareness_hiv_aids + access_to_media + ethnicity:access_to_media
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      7568     5849.7                     
## 2      7565     5845.7  3   4.0736   0.2536

The residual deviance in Model 3 is slightly lower compared to Model 1, indicating a potential improvement in model fit but not significant.

The p-value of 0.251 is not below the common significance level. This indicates that the addition of the interaction term ethnicity:access_to_media in Model 3 does not significantly improve the model fit compared to Model 2. In simpler terms, the interaction between ethnicity and access_to_media does not seem to have a significant effect on predicting child marriage.

ROC Curve and AUC

invisible(plot(roc(og_df$child_marriage,
                   fitted(base_model_no_impute)),
               col = "red",
               main = "ROC Curve: \nModel 1 (red) vs. Model 2 (green) vs. Model 3 (blue)"))

invisible(plot(roc(og_df$child_marriage,
                   fitted(enhanced_model_no_impute)),
               col = "green",
               add = T))

invisible(plot(roc(og_df$child_marriage,
                   fitted(interaction_model_no_impute)),
               print.auc = T,
               col = "blue",
               add = T))

# For each model
roc_response_model_1 <- roc(og_df$child_marriage, fitted(base_model_no_impute))
auc_model_1 <- auc(roc_response_model_1)

roc_response_model_2 <- roc(og_df$child_marriage, fitted(enhanced_model_no_impute))
auc_model_2 <- auc(roc_response_model_2)

roc_response_model_3 <- roc(og_df$child_marriage, fitted(interaction_model_no_impute))
auc_model_3 <- auc(roc_response_model_3)

# Compare AUC values
print(paste("AUC Model 1:", auc_model_1))
## [1] "AUC Model 1: 0.78337409946712"
print(paste("AUC Model 2:", auc_model_2))
## [1] "AUC Model 2: 0.79976685814396"
print(paste("AUC Model 3:", auc_model_3))
## [1] "AUC Model 3: 0.800632829652589"
  1. Model 1 (AUC: 0.7465):

An AUC of 0.5 represents a model with no discriminative ability (akin to random guessing), while an AUC of 1.0 represents perfect discrimination. So, my model is performing substantially better than random guessing.

The AUC of 0.7 indicates that the Model 1 has good discriminative ability. In other words, it is capable of distinguishing between cases and controls with a high degree of accuracy.

  1. Model 2 (AUC: 0.7661):

This model shows a slight improvement in AUC over the Model 1. The increase suggests that the additional variables (or adjustments) I made in the Model 2 contribute positively to its ability to differentiate between cases and controls. The difference in AUC between the Model 1 and Model 2, while modest, is still meaningful, especially in practical, real-world contexts.

  1. Model 3 (AUC: 0.7672):

The Model 3 shows a very slight improvement in AUC over the Model 2. This indicates that adding interaction terms provides a marginal improvement in the model’s discriminatory power. However, the improvement is very minimal, which aligns with my earlier findings that the interaction terms did not significantly improve the model fit.

  1. Overall:

All models demonstrate good ability to distinguish between cases and controls. An AUC greater than 0.7 is generally considered acceptable, and my models are above 0.7 and close to 0.8. The Model 2 and Model 3 only show marginal improvements in AUC compared to the Model 1. This suggests that while the additional complexity (more variables and interaction terms) does contribute slightly to model performance, the gains are not substantial.

Residuals Plot

binnedplot(fitted(interaction_model_no_impute),
           residuals(interaction_model_no_impute, type = "response"),
           nclass = NULL,
           xlab = "Expected Values",
           ylab = "Average Residuals",
           main = "Binned Residual Plot",
           cex.pts = 1,
           col.int = "gray")

Hosmer-Lemeshow Test

# 1. Hosmer-Lemeshow Test for the Model 1
hoslem.test(base_model_no_impute$y, fitted(base_model_no_impute), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  base_model_no_impute$y, fitted(base_model_no_impute)
## X-squared = 15.295, df = 8, p-value = 0.05366
# 2. Hosmer-Lemeshow Test for the Model 2 (Baseline + Fixed Effects)
hoslem.test(enhanced_model_no_impute$y, fitted(enhanced_model_no_impute), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  enhanced_model_no_impute$y, fitted(enhanced_model_no_impute)
## X-squared = 10.076, df = 8, p-value = 0.2597
# 3. Hosmer-Lemeshow Test for the Model 3 (Baseline + Fixed Effects + Interaction Terms)
hoslem.test(interaction_model_no_impute$y, fitted(interaction_model_no_impute), g=10)
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  interaction_model_no_impute$y, fitted(interaction_model_no_impute)
## X-squared = 8.8531, df = 8, p-value = 0.3548

A large p-value (>0.05) indicates a good fit, meaning that there’s no significant difference between the observed and predicted values. Through each model, the p-value increases which suggests that our decision to include fixed effects and interaction terms are significant.

Accessing Multicollinearity (VIF)

# VIFs check (A VIF value > 5 indicates high multicollinearity)
# Base model
model_1_vif <- vif(base_model_no_impute)
print(model_1_vif)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.178945  1        1.085793
## education_level           1.745961  5        1.057313
## wealth_index              1.613551  4        1.061629
## health_insurance          1.029280  1        1.014534
## current_contraceptive_use 1.009914  1        1.004945
## awareness_hiv_aids        1.483532  1        1.218003
## access_to_media           1.216401  1        1.102906
# Enhanced model
model_2_vif <- vif(enhanced_model_no_impute)
print(model_2_vif)
##                               GVIF Df GVIF^(1/(2*Df))
## area                      1.311056  1        1.145014
## region                    5.810682  5        1.192402
## education_level           2.259331  5        1.084921
## ethnicity                 7.849140  3        1.409733
## wealth_index              2.856573  4        1.140199
## health_insurance          1.087155  1        1.042667
## current_contraceptive_use 1.025894  1        1.012864
## awareness_hiv_aids        1.784748  1        1.335945
## access_to_media           1.260862  1        1.122881
# Interacton model
model_3_vif <- vif(interaction_model_no_impute)
## there are higher-order terms (interactions) in this model
## consider setting type = 'predictor'; see ?vif
print(model_3_vif)
##                                  GVIF Df GVIF^(1/(2*Df))
## area                         1.314708  1        1.146607
## region                       5.880634  5        1.193830
## education_level              2.268331  5        1.085352
## ethnicity                 1256.569385  3        3.284968
## wealth_index                 2.897285  4        1.142218
## health_insurance             1.088549  1        1.043336
## current_contraceptive_use    1.027229  1        1.013523
## awareness_hiv_aids           1.790388  1        1.338054
## access_to_media              6.811523  1        2.609890
## ethnicity:access_to_media  853.958141  3        3.080156
  • Model 1 has the least concern with multicollinearity.
  • Model 2 introduces region and ethnicity, with a mild increase in multicollinearity, but not at alarming levels.
  • Model 3 exhibits more noticeable multicollinearity, particularly with ethnicity, access_to_media and its interaction with one another`; their VIF values are significantly higher (around 3.27, 2.61, and 3.07, respectively). Although these values are below 5, the increase is substantial compared to the previous models and could start to affect the reliability and interpretability of the regression coefficients for these variables.
  • High multicollinearity in models, especially those with interaction terms, doesn’t invalidate the model but does complicate the interpretation of specific coefficients.

Assessing AIC and BIC

# Create a data frame to hold AIC and BIC values
aic_bic_comparison <- data.frame(
    Model = c("Model 1", "Model 2", "Model 3"),
    AIC = c(AIC(base_model_no_impute), AIC(enhanced_model_no_impute), AIC(interaction_model_no_impute)),
    BIC = c(BIC(base_model_no_impute), BIC(enhanced_model_no_impute), BIC(interaction_model_no_impute))
)

# Print the table
print(aic_bic_comparison)
##     Model      AIC      BIC
## 1 Model 1 6116.798 6220.819
## 2 Model 2 5895.745 6055.243
## 3 Model 3 5897.671 6077.974

Model 2 has the lowest AIC and BIC values of all, indicating that it might be the best model among the three in terms of balancing fit and complexity.